require(ggplot2)
require(arm)
require(cvTools)
require(ggthemes)
require(pROC)
require(coefplot)
require(reshape2)
require(boot)
require(plyr)
require(dplyr)
require(lubridate)
require(glmnet)
require(Hmisc)
require(jsonlite)
require(zoo)
require(scales)
require(stringr)
require(zipcode)
theme_set(theme_minimal(9))
data <- read.csv('~/projects/udacity_DataR/P00000001-OH.csv', sep=',',row.names=NULL,stringsAsFactors = FALSE)
# The dataset had each column name shifted to the right one place with an additional column at the end filled with NAs.
colnames(data)[1:18] <- colnames(data)[2:19]
data <- data[1:18]
dim(data)
## [1] 129325 18
str(data)
## 'data.frame': 129325 obs. of 18 variables:
## $ cmte_id : chr "C00431171" "C00431171" "C00431171" "C00431171" ...
## $ cand_id : chr "P80003353" "P80003353" "P80003353" "P80003353" ...
## $ cand_nm : chr "Romney, Mitt" "Romney, Mitt" "Romney, Mitt" "Romney, Mitt" ...
## $ contbr_nm : chr "SMITH, STEPHEN C. MR." "SMITH, STEPHEN C. MR." "SMITH, TRAVIS" "STEINER, GREGORY" ...
## $ contbr_city : chr "FINDLAY" "FINDLAY" "UPPER ARLINGTON" "WOOSTER" ...
## $ contbr_st : chr "OH" "OH" "OH" "OH" ...
## $ contbr_zip : chr "458406817" "458406817" "432213322" "446917442" ...
## $ contbr_employer : chr "AQUA-LINE INC." "AQUA-LINE INC." "HMI INC" "FLIGHT BOSS" ...
## $ contbr_occupation: chr "BUSINESS OWNER" "BUSINESS OWNER" "SALES" "FABRICATOR" ...
## $ contb_receipt_amt: num 50 50 100 250 250 100 100 200 100 300 ...
## $ contb_receipt_dt : chr "13-AUG-12" "23-AUG-12" "23-AUG-12" "09-AUG-12" ...
## $ receipt_desc : chr "" "" "" "" ...
## $ memo_cd : chr "" "" "" "" ...
## $ memo_text : chr "" "" "" "" ...
## $ form_tp : chr "SA17A" "SA17A" "SA17A" "SA17A" ...
## $ file_num : int 896743 896743 896743 896743 896743 896743 896743 896743 896743 896743 ...
## $ tran_id : chr "SA17.1995556" "SA17.2184389" "SA17.2147759" "SA17.1850253" ...
## $ election_tp : chr "P2012" "P2012" "G2012" "P2012" ...
head(data[1:8])
## cmte_id cand_id cand_nm contbr_nm contbr_city
## 1 C00431171 P80003353 Romney, Mitt SMITH, STEPHEN C. MR. FINDLAY
## 2 C00431171 P80003353 Romney, Mitt SMITH, STEPHEN C. MR. FINDLAY
## 3 C00431171 P80003353 Romney, Mitt SMITH, TRAVIS UPPER ARLINGTON
## 4 C00431171 P80003353 Romney, Mitt STEINER, GREGORY WOOSTER
## 5 C00431171 P80003353 Romney, Mitt SUPRENANT, ALBERT DAYTON
## 6 C00431171 P80003353 Romney, Mitt SUTTER, ERIC NEW ALBANY
## contbr_st contbr_zip contbr_employer
## 1 OH 458406817 AQUA-LINE INC.
## 2 OH 458406817 AQUA-LINE INC.
## 3 OH 432213322 HMI INC
## 4 OH 446917442 FLIGHT BOSS
## 5 OH 454589145 THE LIMITED
## 6 OH 430548100 CERNER
head(data[9:18])
## contbr_occupation contb_receipt_amt contb_receipt_dt receipt_desc
## 1 BUSINESS OWNER 50 13-AUG-12
## 2 BUSINESS OWNER 50 23-AUG-12
## 3 SALES 100 23-AUG-12
## 4 FABRICATOR 250 09-AUG-12
## 5 FINANCIAL ANALYST 250 29-AUG-12
## 6 SALES 100 06-AUG-12
## memo_cd memo_text form_tp file_num tran_id election_tp
## 1 SA17A 896743 SA17.1995556 P2012
## 2 SA17A 896743 SA17.2184389 P2012
## 3 SA17A 896743 SA17.2147759 G2012
## 4 SA17A 896743 SA17.1850253 P2012
## 5 SA17A 896743 SA17.2253693 G2012
## 6 SA17A 896743 SA17.1793975 P2012
describe(data$cand_nm)
## data$cand_nm
## n missing unique
## 129325 0 14
##
## Bachmann, Michele (435, 0%)
## Cain, Herman (583, 0%), Gingrich, Newt (1432, 1%)
## Huntsman, Jon (44, 0%)
## Johnson, Gary Earl (131, 0%)
## McCotter, Thaddeus G (9, 0%)
## Obama, Barack (91442, 71%)
## Paul, Ron (4271, 3%), Pawlenty, Timothy (61, 0%)
## Perry, Rick (239, 0%)
## Roemer, Charles E. 'Buddy' III (152, 0%)
## Romney, Mitt (28474, 22%)
## Santorum, Rick (2012, 2%), Stein, Jill (40, 0%)
The data contains 19 features which makes it difficult to understand. Ideally, I would like to reduce the number of features to no more than 10. However, what I can tell is that the majority of the conbributions were made to Barack Obama (71%) and Mitt Romney (22%) with the other 12 candidates splitting the remaining 7% of contributions. I can also see that the candidate’s party is not included the dataset - it will be necessary to add this information as it is vital to answering the question in which I’m interested - To which party was the contribution made?
# Census data for Ohio
pops <- fromJSON('~/projects/udacity_DataR/ohio_city_populations.json')
# Add Population of the Contributor's City
getPopulation <- function(data){
sapply(data$contbr_city,function(x) {
ifelse(x %in% names(pops),as.numeric(pops[as.character(x)]),NA)
})
}
data$population <- getPopulation(data)
# Create a feature describing how many days from the election the contribution was made
getElectDelta <- function(data){
data$contb_receipt_dt <- as.Date(data$contb_receipt_dt, "%d-%b-%y")
election_day <- as.Date("2012-11-06")
sapply(data$contb_receipt_dt,function(x) {
return(election_day - x)
})
}
data$elect_delta <- getElectDelta(data)
# Identify the Party of each Candidate
getCandParty <- function(data) {
cand_party = list()
for (cand in unique(data$cand_nm)) {
if (cand == 'Obama, Barack') {
cand_party['Obama, Barack'] <- 'D'
}
else if (cand == 'Stein, Jill') {
cand_party['Stein, Jill'] <- 'G'
}
else if (cand == 'Johnson, Gary Earl') {
cand_party['Johnson, Gary Earl'] <- 'L'
}
else {
cand_party[as.character(cand)] <- 'R'
}
}
sapply(as.character(data$cand_nm), function(x) {
as.character(cand_party[x])
})
}
data$cand_party <- getCandParty(data)
# Zip codes should be strings
data$contbr_zip <- substr(as.character(data$contbr_zip),1,5)
# I'm not concerned with negative donations - I'm not really sure what they mean
data <- subset(data,contb_receipt_amt >0)
write.csv(data,'~/projects/udacity_DataR/data_gender_pred.csv')
# Reload the data after running the python scripts
data <- read.csv('~/projects/udacity_DataR/data_w_salary_gender.csv',stringsAsFactors = FALSE)
# making an assumption that if the contributor's name, city, and employer show up more than once, it is the same person
# this would indicate that they made multiple contributions
getMultipleContb <- function(data){
rows <- paste(data$contbr_nm, data$contbr_city, data$contbr_employer, sep=" ")
ifelse(duplicated(rows) == TRUE, 1, 0)
}
data$multiple_contb <- getMultipleContb(data)
# predicted_gender is a better variable name
data$predicted_gender <- data$gender
# check to see if the contributor identified their gender in their name
getGender <- function(data){
sapply(as.character(data), function(name) {
if(grepl("MRS.",name)){
return("female")
}
else if(grepl("MR.",name)){
return("male")
}
else if(grepl(" MS.",name)){
return("female")
}
else{
return(as.character(NA))
}
})
}
data$gender <- getGender(data$contbr_nm)
# create a feature based on if the contributor included MR. MRS., or MS. in their name
data$included_gender <- ifelse(is.na(data$gender), 0, 1)
# use predicted gender for those contributos that did not include MR., MRS., MS. in their contribution
getFinalGender <- function(data){
sapply(1:length(data[,'gender']), function(i){
if (is.na(data[i,'gender'])){
data[i,'predicted_gender']
}
else{
data[i,'gender']
}
})
}
data$predicted_gender <- getFinalGender(data)
# Adds binary feature if the contributor's zip code is within a city
# Zip codes are used because I wanted to include the surrounding areas
add_city <- function(city){
sapply(data$contbr_zip, function(zip){
if (substring(as.character(zip),1,5) %in% city){
1
}
else{
0
}
})
}
# http://www.city-data.com allowed me to search for zipcodes in specific cities
cbus <- as.character(c(43002, 43004, 43016, 43017, 43026, 43035, 43054, 43065, 43081, 43082, 43085, 43119, 43123, 43137, 43147, 43201, 43202, 43203, 43204, 43205, 43206, 43207, 43210, 43211, 43212, 43213, 43214, 43215, 43217, 43219, 43220, 43221, 43222, 43223, 43224, 43227, 43228, 43229, 43230, 43231, 43235, 43240))
cleveland <- as.character(c(44101, 44103, 44104, 44105, 44106, 44107, 44111, 44112, 44113, 44114, 44115, 44117, 44119, 44120, 44121, 44125, 44127, 44134))
cincy <- as.character(c(45202, 45203, 45204, 45205, 45206, 45207, 45208, 45209, 45212, 45214, 45216, 45217, 45219, 45220, 45223, 45224, 45225, 45226, 45227, 45229, 45230, 45231, 45232, 45239, 45243))
data$cincy <- add_city(cincy)
data$cbus <- add_city(cbus)
data$cleveland <- add_city(cleveland)
# drop rows without an estimated salary
data <- subset(data, estimated_salary!= 'No Data ')
# estimated salary was a string
data$estimated_salary <- as.numeric(data$estimated_salary)
# making an assumption that if the contributor's name, city, and employer show up more than once, it is the same person
# this would indicate that they made multiple contributions
getMultipleContb <- function(data){
rows <- paste(data$contbr_nm, data$contbr_city, data$contbr_employer, sep=" ")
ifelse(duplicated(rows) == TRUE, 1, 0)
}
data$multiple_contb <- getMultipleContb(data)
# get additional info about contribution dates
data$weekday <- weekdays(as.Date(data$contb_receipt_dt))
data$month <- month(data$contb_receipt_dt)
data$year <- year(data$contb_receipt_dt)
data$yearmon <- as.yearmon(as.Date(data$contb_receipt_dt))
# create a sample size to be used for testing/training sets later and to avoid data snooping
smp_size <- floor(0.75 * nrow(data))
## set the seed to make your partition reproductible
set.seed(1)
# create index
index <- sample(seq_len(nrow(data)), size = smp_size)
# data for exploratory analysis
data <- data[index, ]
# hold data out to avoid data snooping
held_out_data <- data[-index, ]
contb.quantile <- quantile(data$contb_receipt_amt, .95)
ggplot(aes(data$contb_receipt_amt), data=data) + geom_density() + xlim(0,contb.quantile)
ggplot(aes(log(data$contb_receipt_amt)), data=data) + geom_histogram() + scale_x_discrete(limits = c(0:round(log(contb.quantile))))
The contribution amount is not normally distributed, it is closer to a log-normal distribution, and is multi-modal. It appears that the most frequent contribution amounts were less than or equal to $100. One can also see that the data contains peaks at specific values such as 100, 250, and 500 - this aspect of the data gives it the feeling of being discrete.
##
## G2012 P P2012
## 4 42909 2 35644
election_tp tells in which election the donation was made; namely the Primary or General Election which respectively accounted for 45% and 55% of all donations.
## data$contbr_occupation
## n missing unique
## 78443 116 3824
##
## lowest : 2ND GRADE TEACHER 3D MODELER 401K SALES 4TH GRADE TEACHER A/P CLERK
## highest: YOUTH MINISTER YOUTH MINISTRY YOUTH SERVICES YOUTH SERVICES ASSOCIATE ZONE BUSINESS MANAGER
## data$contbr_employer
## n missing unique
## 78315 244 10121
##
## lowest : (SELF) GREEN LEAF LAWN CARE (SELF) STATE FARM INSURANCE COMPANIES 1 EDI SOURCE, INC 1 FINANCIAL CORPORATION 1099
## highest: ZINKAN ENTERPRISES INC. ZION SYNEK & ASSOC. ZOO GAMES INC. ZUKERMAN DAIKER & LEAR ZUKERMAN DAIKER LEAR
There are over 4,000 unique occupations and 11,000 unique employers - far too many visually look at.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 77 13840 37530 173800 297500 822600
elect_delta follows the log-normal distribution. This makes sense as American’s are typically apathetic towards politics especially when an election is months or years away. The two campaigns were also likely to have ramped up their efforts to solicit contributions as election day neared.
## data$cand_party
## n missing unique
## 78559 0 4
##
## D (56699, 72%), G (21, 0%), L (72, 0%), R (21767, 28%)
Democratic candidates received 72% of donations as opposed to the Republicans 28%. This is interesting because Ohio is typically considered a battle ground state meaning that the voting population is equally split between the two parties.
##
## female male
## 38492 40067
##
## 0 1
## 67037 11522
Out of the over 100,000 records, only about 15,000 records exist where the contributor provided an indication of their gender by using “MS.”, “MRS.”, or “MR.”. Using the way back machine, I looked at the Obama and Romney donation page on their website - neither seemed to ask for gender nor have a form for the donor to identify their gender.
Because so few contributors I need to predict the gender of the contributors that did not include MR., MRS., or MS. I attempted to use the gender package in R, but it was very slow (it never completed the task, but it had been running over 8 hours when I finally stopped it). As such, I decided to use Python’s NLTK library for the task and created a script called classify_gender.py. Running classify_gender.py creates a new csv file (data_gender_predicted.csv) with predicted gender names and prints “You classified 0.7704 correct on the test set” to stdout. After running both python scripts, I loaded the results back into R.
Trying to use all of the information provided in the original file and thinking that a person’s salary could have predictive power, I wrote a web scraping script, get_estimated_salaries.py, in Python that gets salary information for different occupations. Indeed.com allows you to enter an occupation and zip code and returns an average salary.
##
## 0 1
## 24296 54263
I made the assumption that if a contributor’s name, city, and employer show up more than once, it is the same person and this would indicate that they made multiple contributions.
I am interested to which political party the contribution was made as I will attempt to build a model to predict this. I believe some of the relevant features will be the amount contributed, when the contribution was made, the gender of the contributor, the location of the contributor, the salary of the contributor, and if the contributor made multiple contributions. It would be nice to know the age of the contributor, but this information is difficult to acquire.
data <- subset(data, cand_party == 'R' | cand_party == 'D' )
groupData <- function(initial_data,...){
# helper function to make frequent grouping of data with diffrent variables easier
gp_data <-group_by(initial_data,...)
gps_data<-summarise(gp_data,
mean_contb = mean(contb_receipt_amt),
median_contb = median(contb_receipt_amt),
sum_contb = sum(contb_receipt_amt),
mean_pop = mean(population),
median_pop = median(population),
median_elect_delta = median(elect_delta),
mean_elect_delta = mean(elect_delta),
count = n())
gps_data
}
data.by.party <- groupData(data[,1:33],cand_party)
# Comparison of contribution amount between parties
ggplot(data,aes(cand_party, log(contb_receipt_amt+1))) + geom_boxplot()
# Distribution of when contributions were made by party
ggplot(data,aes(elect_delta)) + geom_histogram(aes(fill=cand_party))
# Compares estimated salary of donors between both parties
ggplot(data, aes(cand_party,log(estimated_salary+10))) + geom_boxplot()
# Count of men & women donors by party
ggplot(data, aes(x=predicted_gender)) + geom_bar() + facet_wrap(~cand_party)
# Comparison of the distribution of contributions by weekday between parties
data$weekday <- factor(data$weekday, levels= c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
ggplot(data[order(data$weekday),],aes(weekday)) + geom_bar() + facet_wrap(~ cand_party) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(data[order(data$weekday),],aes(weekday, contb_receipt_amt)) + facet_wrap(~ cand_party) + stat_summary(fun.y = mean, geom="bar") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
###Do people that include their gender donate to D or R?
data.cand_party.include_gender <- groupData(data[,1:33],cand_party, included_gender)
ggplot(data.cand_party.include_gender,aes(included_gender, count)) + geom_bar(stat='identity') + facet_wrap(~ cand_party) + scale_x_discrete(breaks = c(0,1))
table(data[c('cand_party', 'included_gender')])
## included_gender
## cand_party 0 1
## D 56598 101
## R 10346 11421
# Comparison of population betwen parties
ggplot(data, aes(cand_party, population)) + geom_boxplot()
# Comparison of contribution amounts between paraties
ggplot(aes(x=log(contb_receipt_amt +1)), data=data) + geom_histogram(aes(fill=cand_party))
qplot(data=data, elect_delta, contb_receipt_amt) + ylim(0,quantile(data$contb_receipt_amt, .95))
ggplot(data, aes(yearmon, contb_receipt_amt)) + stat_summary(fun.y=mean, geom='line') + scale_x_yearmon() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(data, aes(yearmon, estimated_salary)) + stat_summary(fun.y=mean, geom='line') + scale_x_yearmon() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(data, aes(yearmon, population)) + stat_summary(fun.y=mean, geom='line') + scale_x_yearmon() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
It appears that contributions made to Republicans were higher than those made to Democrats; however, Democrats received a greater number of contributions. The gender breakdown shows that amongst donations made to Democrats, females made more contributions than males. Looking at Republican contributions, males made more contributions than females. This makes sense as females are more likely to identify with Democrats (http://www.people-press.org/2012/06/04/section-9-trends-in-party-affiliation/).
One of the most evident relationships was that contributions made to Republicans were higher than those made to Democrats while the opposite was true for the number of contributions. The most intriguing relationship, in my opinion, was between contributors who indicated their gender through the use of Mr, Ms, or Mrs in their name - roughly 70% of those who donated to Republicans indicated their gender while less than 1% of Democratic contributors indicated theirs. I have been unable to come up with an explanation for this - I thought that maybe Republicans included a place to indicate gender on their donation form/website, but I was unable to find any indication that this was true.
ggplot(data,aes(elect_delta)) + geom_histogram(aes(fill=election_tp)) + facet_wrap(~ cand_party)
ggplot(data,aes(yearmon, contb_receipt_amt)) + geom_point(alpha = .5, position= position_jitter(),color="gray") + geom_line(stat = "summary", fun.y = mean,aes(color=cand_party),size=1.5) + ylim(0,quantile(data$contb_receipt_amt, 0.95)) + scale_x_yearmon()
data(zipcode)
data$zip <- clean.zipcodes(data$contbr_zip)
# zipcode package would change my zipcode in 'data' variable to zip codes in MA for some reason. Subsetting only Ohio values from zipcode seemed to solve this issue
oh <- subset(zipcode, state=='OH')
map.data <- merge(subset(data, contb_receipt_amt > 0), oh, by.x='contbr_zip', by.y='zip')
map.plot <- ggplot(data=subset(map.data,cand_party == 'D' | cand_party == 'R'),aes(x=longitude, y=latitude, color=cand_party, size = contb_receipt_amt)) + geom_point(position = position_jitter(w=.08,h=.08)) + scale_size_continuous(breaks = c(50,100,500,1000,2500,5000,10000),range=c(1,15), name='Contribution Amount') + labs(x=NULL, y=NULL, color='Candidate Party') + scale_color_tableau()
map.plot <- map.plot + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.ticks=element_blank(),axis.text.x=element_blank(),axis.text.y=element_blank())
map.plot
Predicting to which party a contribution was made will require me to classify each contribution as either a Democrat or Republican (For simplicity, I’m going to drop any contributions made to third parties). As I will be tackling a binary classification problem, logistic regression is an appropriate model with which to start. Logistic regression uses a set of covariates to predict probabilities of (binary) class membership. We can then set a threshold to map these probabilities to class labels to solve the classification problem.
Logistic regression is an extension of the linear regression model with two important differences - the outcome variable and the error terms.
Outcome Variable
Key to any regression problem is the conditional mean of the outcome variable y given x. In linear regression, we assume that the conditional mean is a linear function taking values in \((-\infty, \infty)\): \[{E(y|x) =}\; \alpha + \beta{x}\] Unlike linear regression, the outcome variable of logistic regression has a conditional mean that takes values within [0,1]. In order to extend the linear regression model to logistic regression, we must map the outcome variable \(E(y|x)\) into [0,1] via a transformation called the logistic function. \[{E(y|x) =}\;\pi(x) = {e^\alpha+\beta{x}\over 1 + e^{\alpha+\beta{x}}}\] The logit function or log-odds function is a transformation of the logistic function. It can be useful in helping interpret the results. \[{g(x)=}\; \ln({\pi(x)\over {1-\pi(x)}}) = \alpha + \beta{x}\]
Error Terms
One of the key assumptions of linear regression is that the error terms follow independent Gaussian distributions with zero mean and constant variance. In logistic regression, the outcome variable can only be 0 or 1. Because of this, the error terms of logistic regression follow a Bernoulli distribution: \[\epsilon \sim \beta(0,\pi(1 - \pi))\]
In linear regression, the parameter \(\beta\) represents the change in the response variable for a unit change in the covariate. In logistic regression, the parameter \(\beta\) represents the change in the logit function for a unit change in the covariate. To interpret this change, we must define odds ratio.
The odds ratio of a binary event is given by the odds of the event divided by the odds of its complement: \[{OR =}\; {O(x=1)\over{O(x=0)}} = {\pi(i)/(1-\pi(1))\over{\pi(i)/(0-\pi(0))}}\]
Substituting the definition of \(\pi(x)\) into this equation yields (after a little algebra) \[{OR = }\; \epsilon^\beta\] The relationship between the odds ratio and \(\beta\) is what makes logistic regression such a powerful tool.
# prepare the data set
train <- data[c(11, 19, 20, 21, 22, 26, 28, 29, 30, 31, 32, 33)]
test <- held_out_data[c(11, 19, 20, 21, 22, 26, 28, 29, 30, 31, 32, 33)]
# this is America, third parties get ignored
train <- subset(train, cand_party == 'D' | cand_party == 'R')
test <- subset(test, cand_party == 'D' | cand_party == 'R')
# set D = 1, R = 0
train$cand_party <- ifelse(train$cand_party == 'D', 1, 0)
test$cand_party <- ifelse(test$cand_party == 'D', 1, 0)
# set male = 1, female = 0
train$predicted_gender <- ifelse(train$predicted_gender == 'male',1,0)
test$predicted_gender <- ifelse(test$predicted_gender == 'male',1,0)
# set general election = 1, primary election = 0
train$election_tp <- ifelse(train$election_tp == 'G2012',1,0)
test$election_tp <- ifelse(test$election_tp == 'G2012',1,0)
# fit model
fit <- glm(cand_party ~ log(contb_receipt_amt + 1) + log(estimated_salary+1) + predicted_gender + cbus + cincy + cleveland + log(elect_delta + 1) + election_tp + log(population+1) + multiple_contb + included_gender, data=train, family="binomial")
# predict results on our test set
predpr <- predict(fit, newdata=test,type="response")
test$predicted <- predpr
# plot coefficients
coefplot(fit,intercept=FALSE)
Instead of simply looking at a print out of the coefficients, I find it much easier to understand my model with a visualization. The y-axis of this plot containes all of the covariates in my model and the x-axis represents coefficients. The distance a point lies from zero is indicative of its influence in the model - the closer to zero, the less influential. Negative coefficients are associated with Republicans and positive coefficients with Democrats - this is true simply because I encoded Republicans as 0 and Democrats as 1. A negative coefficient will therefore “pull” down towards 0 and a postive coefficient will “push” up towards 1.
The most striking thing about this plot is the included_gender coefficient - it is very far away from zero on the negative side. Thinking about it, this makes sense because so many Republican contributors included their gender while nearly none of Democratic contributors included theirs. The size of this coefficient increases my suspision that there is something dubious about my included_gender variable. However, because I was unable to identify any explanations as to why Republican contributors included this information and this is a fun side project, I’ll ignore my reservations about including this variable in my model. However, if I were in this situation in a legitimate work or research setting, I would not include this variable without extensive addtional research.
There are different ways to evaluate the performance a logistic regression model. I chose to use two techniques, the receiver operating characterstic (ROC) curve and cross-validation.
A ROC curve is a graphical illustration of the performance of a binary classification system that plots the True Positive Rate against False Positive Rate.
In our ROC curve, we will see the proportion correctly identified as Democrats plotted against the proportion incorrectly identified as Democrats. Plotting the true positive rate against the false positive rate will show the ROC curve and by calculating the area under this curve (AUC), we are able to evaluate the performance of our model. A higher AUC indicates greater predictive power:
glm.roc <- roc(test$cand_party ~ predpr)
auc <- round(glm.roc$auc,2)
roc.df <- data.frame(specificities=glm.roc$specificities,sensitivities = glm.roc$sensitivities)
roc.plot <- ggplot(roc.df,aes(1-specificities, sensitivities)) + geom_line(aes(), size=0.5) + geom_abline(intercept = 0) + xlab('False Positive Rate (1 - Specificity)') + ylab('True Positive Rate (Sensitivity)') + annotate("text",x=.50,y=0.80,label=paste('The Area Under the Curve is',auc))
roc.plot
As previously mentioned, an AUC of 1 would indicate that our model has perfect predictive power. Our AUC is 0.89 - this is indicates that our model is performing very well. Next, we can perform cross-validation so that we can evaluate our model’s performance from a different perspective.
The second method I used to evaluate my model is cross-validation, specifically a K-Fold Cross-Validation. In my opinion, the results from cross-validation provide a more intuitive way to evaluate our model because it returns a simple metric - the percent of predictions that were incorrectly classified.
Cross-validation involves the following steps:
cv.fit <- cv.glm(train, fit, K=10)
cv.error <- cv.fit$delta[1]
paste("The estimate of the out-of-sample error is",cv.error)
## [1] "The estimate of the out-of-sample error is 0.0960496532264256"
Our estimated out-of-sample error is near 9% - this means that we are correctly classifying about 91% of predictions!
map.plot + ggtitle("Contribution Amount by Party and Zip Code") + theme(plot.title=element_text(size=20))
The map makes it easy to spot the most populated cities in Ohio. The center mass is Columbus and to its left, you can see a cluster which contains Dayton at the top and Cincinatti at the bottom. The top right cluster is the Cleveland area and the top left is Toledo. The size of each point indicates the amount of the contribution. Contributions to Republicans appear to be larger than contributions to Democrats. However, there are more contributions made to Democrats than Republicans.
ggplot(data,aes(yearmon, contb_receipt_amt)) + geom_point(alpha = .5, position= position_jitter(),color="gray") + geom_line(stat = "summary", fun.y = mean,aes(color=cand_party),size=1.5) + ylim(0,quantile(data$contb_receipt_amt, 0.95)) + scale_x_yearmon() + labs(x='Month and Year', y='Mean Contribution Amount', color="Candidate's Party") + ggtitle("Contribution Amount Over Time") + theme(plot.title=element_text( size=20), axis.title.x = element_text(size=16), axis.title.y = element_text(size=16))
This plot depicts the mean contribution made to each party over time. As you can see, contributions made to Republicans were consistently higher than contributions made to Democrats. Further, the mean contribution amount for both parties decreased as it got closer to the election.
ggplot(data,aes(elect_delta)) + geom_histogram(aes(fill=cand_party)) + labs(x='Days From Election', y='Count', fill="Candidate's Party") + ggtitle("Number of Contributions Over Time") + theme(plot.title=element_text( size=20), axis.title.x = element_text(size=16), axis.title.y = element_text(size=16))
The histogram reveals that as it got closer to election time, both Democrats and Republicans received a greater volume of contributions. This makes sense as Americans are typically apathetic towards politics, especially when an election is a year or more away. However, as the election got closer, compaigns would have ramped up their efforts to solicit contributions and people likely started paying more attention to the election.
Overall, I was satisfied with the results I was able to obtain from my analysis. The most difficult part of the analysis was figuring out ways to extract meaning from the data - this meant that I often had to use the existing data to create new features based on it. However, I feel that it was a great experience because it provided me the opportunity to do a different sort of data wrangling than the typical data cleaning - I actually had to create new data or go out and find new data that would be helpful in my analysis.
As I previously mentioned when discussing my logistic regression model, the “included_gender” variable was my greatest concern because it seemed odd that so many Republican contributors indicated their gender while so few Democrats did. When I ran the analysis without the “included_gender” variable, the ACU dropped to 0.78 which is respectable, but certainly no where near 0.89 that was obtained by including the variable. Clearly, this analysis would not have been appropriate in an academic setting because I included features that were estimates, such as the predicted_gender and estimated_salary. Because my goal was simply to make the best predictions possible, I did not have an issue including the aforementioned features.
If I were to continue this analysis or if I were working for a political campaign, I think I would change my focus from classifying which party the contribution was made to, to predicting the amount of the contribution. I feel that being able to predict the amount that a person will contribute could be used by campaigns to assist with their financial forecasting - something that must be difficult when all of your funding relies on donations.